################################################################################ 
#
# The distribution of hate speech and its implications for content moderation
# PSRM - Replication package
# Figures 1, C3, C4
#
################################################################################ 



library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(lubridate)

# Clear the workspace, keeping only objects whose names start with "wd" and
# the object named exactly "setsave". The helper vector is not part of the
# keep-list, so it is removed by the same rm() call.
objects_to_keep <- ls(pattern = "^wd|^setsave$")
rm(list = setdiff(ls(), objects_to_keep))

# Comment-level newspaper data (one row per comment is assumed -- TODO confirm)
df_medium <- paste0(wd_data, "/medium_descriptives/df_medium.csv") %>%
  read_csv()

# Pre-binned Twitter shares used for the "Twitter CH" and "Twitter US" panels
twitter_ch <- paste0(wd_data, "/medium_descriptives/bins_twitter_chde.csv") %>%
  read_csv()
twitter_us <- paste0(wd_data, "/medium_descriptives/bins_twitter_us.csv") %>%
  read_csv()

# Daily Twitter counts used for the overall hate speech prevalence
twitter_ch_daily <- paste0(wd_data, "/medium_descriptives/hatespeech_daily_n_de_only.csv") %>%
  read_csv()
twitter_us_daily <- paste0(wd_data, "/medium_descriptives/hatespeech_daily_n_80_int_tox_combo.csv") %>%
  read_csv()

# Twitter CH inputs used for the "incl. French" variant (Figure C.4)
twitter_ch_gf <- paste0(wd_data, "/medium_descriptives/bins_twitter_ch.csv") %>%
  read_csv()
twitter_ch_daily_gf <- paste0(wd_data, "/medium_descriptives/hatespeech_daily_n.csv") %>%
  read_csv()


# Concentration of hate speech across users, per medium.
#
# Steps:
#   1. flag a comment as hate speech when hsprob > 0.85;
#   2. count comments and hate speech comments per (medium, user);
#   3. within each medium, compute every user's share of that medium's hate
#      speech and assign users to 100 bins (bin 1 = largest shares);
#   4. sum the shares and count the users per (medium, bin).
#
# The grouping is made explicit before the per-medium step and dropped at the
# end, so the result is a plain (ungrouped) tibble and later steps can safely
# modify `media_anon`.
df_plot_bin <- df_medium %>%
  mutate(hs = ifelse(hsprob > 0.85, "hs", "not hs")) %>%
  group_by(media_anon, userId) %>%
  summarise(ncomments = n(),
            nhs = sum(hs == "hs", na.rm = TRUE)) %>%
  # Per-medium totals and percentile bins
  group_by(media_anon) %>%
  mutate(totalhs = sum(nhs, na.rm = TRUE),
         prop_of_all_hs = nhs / totalhs,
         bin_share_of_all_hs = ntile(desc(prop_of_all_hs), 100)) %>%
  group_by(media_anon, bin_share_of_all_hs) %>%
  summarise(share_sum = sum(prop_of_all_hs),
            n_user = n_distinct(userId)) %>%
  ungroup()

# Stack the newspaper bins with the pre-binned Twitter CH and Twitter US data,
# harmonising column names (bin_share_of_all_hs, share_sum) and adding the
# display name (`media_anon`) and the sample tag (`sample`).
df_with_twitter <- bind_rows(
  # Newspapers: M1 / M2 get their own label, every other non-missing value
  # is labelled "Medium 3"; missing values stay missing.
  df_plot_bin %>%
    mutate(media_anon = case_when(media_anon == "M1" ~ "Medium 1",
                                  media_anon == "M2" ~ "Medium 2",
                                  is.na(media_anon) ~ NA_character_,
                                  TRUE ~ "Medium 3"),
           sample = "media"),
  # Twitter CH
  twitter_ch %>%
    rename(bin_share_of_all_hs = bin,
           share_sum = f_sum) %>%
    mutate(media_anon = "Twitter CH",
           sample = "Twitter CH"),
  # Twitter US: keep only the bin and the MAX_IDENTITY_BOTH_TOX_80 columns
  twitter_us %>%
    select(bin_share_of_all_hs = bin,
           share_sum = f_sum_at_MAX_IDENTITY_BOTH_TOX_80,
           n_user = n_user_at_MAX_IDENTITY_BOTH_TOX_80) %>%
    mutate(media_anon = "Twitter US",
           sample = "Twitter US")
)

## HS overall

# Twitter: total number of posts and number flagged as hate speech, summed
# over the daily count tables.
twitter_ch_nhs <- sum(twitter_ch_daily$n[twitter_ch_daily$is_hatespeech == TRUE])
twitter_ch_n <- sum(twitter_ch_daily$n)

twitter_us_nhs <- sum(twitter_us_daily$n[twitter_us_daily$is_MAX_IDENTITY_BOTH_TOX_80 == TRUE])
twitter_us_n <- sum(twitter_us_daily$n)

# Overall hate speech prevalence (in percent) per newspaper and Twitter sample.
# The hate speech count is taken directly from `hsprob` with the same 0.85
# threshold used for the per-user bins: the `hs` flag is only created inside
# the df_plot_bin pipeline and is never stored on df_medium, so referring to
# `hs` here would fail unless the CSV happens to ship such a column.
dfall_hs <- df_medium %>%
  group_by(media_anon) %>%
  summarise(nhs = sum(hsprob > 0.85, na.rm = TRUE),
            n = n()) %>%
  mutate(media_anon = ifelse(media_anon == "M1", "Medium 1",
                             ifelse(media_anon == "M2", "Medium 2", "Medium 3")),
         sample = "Media") %>%
  bind_rows(data.frame(media_anon = "Twitter CH",
                       nhs = twitter_ch_nhs,
                       n = twitter_ch_n,
                       sample = "Twitter CH")) %>%
  bind_rows(data.frame(media_anon = "Twitter US",
                       nhs = twitter_us_nhs,
                       n = twitter_us_n,
                       sample = "Twitter US")) %>%
  mutate(mean_hs = nhs / n * 100,
         media_anon = factor(media_anon,
                             levels = c("Medium 1", "Twitter CH", "Medium 2",
                                        "Twitter US", "Medium 3")))

# Plot-ready table: reversed bin index, overall prevalence, and display labels.
df_forchart <- df_with_twitter %>%
  group_by(media_anon) %>%
  # Reverse the bin index within each platform so that the heaviest hate
  # speech posters end up on the right of the x axis.
  # NOTE(review): rev() relies on rows being ordered by bin within each
  # platform; that holds for the newspaper bins built in this script, but the
  # row order of the Twitter CSVs should be confirmed.
  mutate(bin = rev(bin_share_of_all_hs)) %>%
  ungroup() %>%
  # Join on a character key: `media_anon` is character on the left side and a
  # factor in dfall_hs.
  left_join(dfall_hs %>%
              transmute(media_anon = as.character(media_anon), mean_hs),
            by = "media_anon") %>%
  mutate(media_anon = factor(media_anon,
                             levels = c("Medium 1", "Twitter CH", "Medium 2",
                                        "Twitter US", "Medium 3")),
         media_label = paste0(media_anon, " (Prevalence: ", round(mean_hs, 2), "%)"),
         # Order the prevalence labels Twitter CH, Twitter US, Medium 1-3.
         # The levels are derived from the data instead of hard-coding the
         # prevalence values, so the labels do not silently become NA when the
         # computed prevalence (or its rounding) changes.
         media_label = factor(
           media_label,
           levels = unique(media_label[order(match(
             as.character(media_anon),
             c("Twitter CH", "Twitter US", "Medium 1", "Medium 2", "Medium 3")
           ))])
         ),
         # Facet titles used in the figures
         media_label_new = case_when(media_anon == "Medium 1" ~ "Newspaper 1",
                                     media_anon == "Medium 2" ~ "Newspaper 2",
                                     media_anon == "Medium 3" ~ "Newspaper 3",
                                     media_anon == "Twitter CH" ~ "Twitter CH",
                                     media_anon == "Twitter US" ~ "Twitter US"),
         media_label_new = factor(media_label_new,
                                  levels = c("Twitter CH", "Twitter US",
                                             "Newspaper 1", "Newspaper 2",
                                             "Newspaper 3")))

# Figure 1
# Share of all hate speech comments per user percentile, one facet per
# platform; Twitter US is excluded here and shown separately in Figure C.3.
df_forchart %>%
  filter(!(media_anon %in% c("Twitter US"))) %>%
  ggplot(aes(x = bin, y = share_sum)) +
  geom_bar(aes(fill = sample), stat = "identity") +
  scale_y_continuous(labels = scales::percent, limits = c(0,.8), 
                     expand = c(0,0)) +
  labs(x = "User Percentile (by Volume of Hate Speech Comments)",
       y = "Share of Total Hate Speech Comments (%)",
       color = "") +
  theme_minimal() + 
  # Name the colours explicitly: unnamed values are matched to the sorted
  # values of `sample`, and how "media" sorts relative to "Twitter CH"
  # depends on the locale's collation.
  scale_fill_manual(values = c("media" = "blue4", "Twitter CH" = "slateblue1")) +
  facet_wrap(~media_label_new, nrow = 2) +
  theme(legend.position = "none")
# ggsave() without a plot argument writes the last plot created above
if (setsave) ggsave(paste0(wd_res, "/figures/fig1.pdf"), height = 5, width = 8)

cat("\n====================\n")
cat("Saved Figure 1")
cat("\n====================\n")

# Figure C.3
# Same chart as Figure 1, restricted to the Twitter US sample.
df_forchart %>%
  filter(media_anon == "Twitter US") %>%
  ggplot(aes(x = bin, y = share_sum)) +
  geom_col(aes(fill = sample)) +
  facet_wrap(~media_label_new, nrow = 1) +
  scale_fill_manual(values = c("slateblue1", "palevioletred1")) +
  scale_y_continuous(labels = scales::percent,
                     limits = c(0, .8),
                     expand = c(0, 0)) +
  labs(x = "User Percentile (by Volume of Hate Speech Comments)",
       y = "Share of Total Hate Speech Comments (%)",
       color = "") +
  theme_minimal() +
  theme(legend.position = "none")

# Write the plot to disk only when saving is switched on
if (setsave) {
  ggsave(paste0(wd_res, "/figures/figC3.pdf"), height = 4, width = 5)
}

cat("\n====================\n")
cat("Saved Figure C3")
cat("\n====================\n")

### Numbers for the article

# Number of comments
n_distinct(df_medium$id)

# Number of comments by registered users (userId present)
with(df_medium, n_distinct(id[!is.na(userId)]))

# Number of comments by unregistered users - now 0
with(df_medium, n_distinct(id[is.na(userId)]))

# Number of users
with(df_medium, n_distinct(userId[!is.na(userId)]))

# Number of comments by medium
table(df_medium$media_anon)

# Number of comments and users by medium
# -- M1: comments and users (registered only)
with(df_medium, n_distinct(id[!is.na(userId) & media_anon == "M1"]))
with(df_medium, n_distinct(userId[!is.na(userId) & media_anon == "M1"]))

# -- M2: all comments, registered comments, users, unregistered comments
with(df_medium, n_distinct(id[media_anon == "M2"]))
with(df_medium, n_distinct(id[!is.na(userId) & media_anon == "M2"]))
with(df_medium, n_distinct(userId[!is.na(userId) & media_anon == "M2"]))
with(df_medium, n_distinct(id[is.na(userId) & media_anon == "M2"]))

# -- M3: comments and users (registered only)
with(df_medium, n_distinct(id[!is.na(userId) & media_anon == "M3"]))
with(df_medium, n_distinct(userId[!is.na(userId) & media_anon == "M3"]))

# Root vs replies
# A parentId of "null" is recoded to NA, so a missing parentId marks a
# comment without a parent.
df_medium <- df_medium %>%
  mutate(parentId = replace(parentId, which(parentId == "null"), NA))
table(is.na(df_medium$parentId), df_medium$media_anon)
table(is.na(df_medium$parentId))
prop.table(table(is.na(df_medium$parentId)))
# Percentage of comments with / without a parent, within each medium
df_medium %>%
  add_count(media_anon, name = "totaln") %>%
  group_by(media_anon, is.na(parentId)) %>%
  summarise(share = n() / first(totaln) * 100)

# Share by top 5%
df_forchart %>%
  filter(bin > 95) %>%
  group_by(media_anon) %>%
  summarise(`sum(share_sum)` = sum(share_sum))


# Figure C.4
# Same chart for the Twitter CH sample labelled "incl. French".

# Overall prevalence (in percent) from the daily counts
twitter_summary_gf <- data.frame(
  media_anon = "Twitter CH (incl. French)",
  nhs = sum(twitter_ch_daily_gf$n[twitter_ch_daily_gf$is_hatespeech==TRUE]),
  n = sum(twitter_ch_daily_gf$n)) %>%
  mutate(mean_hs = nhs/n*100)

# Plot-ready bins: reversed bin index plus the prevalence label
# NOTE(review): rev(bin) relies on the row order of the CSV -- confirm the
# file is sorted by bin.
twitter_chart_gf <- twitter_ch_gf %>%
  mutate(media_anon = "Twitter CH (incl. French)",
         bin = rev(bin)) %>%
  rename(share_sum = f_sum) %>%
  left_join(twitter_summary_gf %>% select(media_anon, mean_hs), by = "media_anon") %>%
  mutate(media_label = paste0(media_anon, " (Prevalence: ", round(mean_hs, 2), "%)"))

twitter_chart_gf %>%
  ggplot(aes(x = bin, y = share_sum)) +
  geom_bar(aes(fill = media_anon), stat = "identity") +
  scale_y_continuous(labels = scales::percent, limits = c(0,.8), 
                     expand = c(0,0)) +
  labs(x = "User Percentile (by Volume of Hate Speech Comments)",
       y = "Share of Total Hate Speech Comments (%)",
       color = "") +
  theme_minimal() + 
  scale_fill_manual(values = c("slateblue1")) +
  facet_wrap(~media_anon) +
  theme(legend.position = "none")
# Respect the `setsave` switch, as Figures 1 and C3 do; previously this figure
# was written to disk even when saving was turned off.
if (setsave) ggsave(paste0(wd_res, "/figures/figC4.pdf"), height = 4, width = 5)

cat("\n====================\n")
cat("Saved Figure C4")
cat("\n====================\n")

